'''
 爬取国际商事法庭|CICC-法律资源
 网址：https://cicc.court.gov.cn/html/1/218/62/index.html

'''

import requests
# 引入xpath 解析html
from lxml import etree
# 引入json
import json

# First listing page of the CICC "legal resources" section to scrape.
url = 'https://cicc.court.gov.cn/html/1/218/62/index.html'

# Send a browser-like User-Agent header with the request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36'
}

# requests never times out by default; bound the wait so an unresponsive
# server cannot hang the script forever.
response = requests.get(url=url, headers=headers, timeout=30)
# Raise on 4xx/5xx instead of silently parsing an error page as if it were
# the listing.
response.raise_for_status()
# Only influences how response.text is decoded; response.content stays raw bytes.
response.encoding = 'utf-8'
print(response.content)
# Build the element tree from the raw response bytes.
parseHtml = etree.HTML(response.content)
# Extract titles and dates: the text of the first and second <span> inside
# each list-item link of the "col-md-12" container.
titleList = parseHtml.xpath('//div[@class="col-md-12"]//ul/li/a/span[1]/text()')
dateList = parseHtml.xpath('//div[@class="col-md-12"]//ul/li/a/span[2]/text()')
print(titleList)
# Number of titles found.
listLength = len(titleList)

# Titles and dates are paired by position. Fail before touching the output
# file if some title would be left without a date, rather than aborting
# half-way through the write and leaving a truncated file behind.
if len(dateList) < listLength:
    raise IndexError(
        'found %d titles but only %d dates' % (listLength, len(dateList))
    )

# Serialize each record on its own, then join with commas so that the file is
# a valid JSON array: no trailing comma before "]", and "[]" when nothing was
# found.
records = [
    json.dumps({'title': title, 'date': date}, ensure_ascii=False)
    for title, date in zip(titleList, dateList)
]
with open('text.json', 'w', encoding='utf-8') as fp:
    fp.write('[' + ','.join(records) + ']')

# "Next page" address: href of the third <a> in the pagination paragraph.
# The XPath returns an empty list when that link is absent, so fall back to
# None instead of crashing on [0].
nextPageLinks = parseHtml.xpath('//div[@class="col-md-12"]//p/a[3]/@href')
nextPageUrl = nextPageLinks[0] if nextPageLinks else None
print(nextPageUrl)

# "Last page" address: href of the fourth <a> in the pagination paragraph.
lastPageLinks = parseHtml.xpath('//div[@class="col-md-12"]//p/a[4]/@href')
lastPageUrl = lastPageLinks[0] if lastPageLinks else None
print(lastPageUrl)

# Decide whether the last page has been reached. A missing "next" link is
# treated as "nothing left to crawl".
# NOTE(review): next == last is also true on the second-to-last page, where
# the last page itself has not been fetched yet; confirm against the site's
# pagination markup before relying on this check.
if nextPageUrl is None or nextPageUrl == lastPageUrl:
    print('爬取完成')
else:
    print('爬取分页')

